********************************************************************************
* This program generates the GDP deflator and the industry-specific (PPI) deflator datasets
*
* Authors: Fowlie and Reguant
* August 2020
********************************************************************************

* Start from an empty session and turn off output paging (permanent setting)
clear all
set more off, perm

* Create the output folders. mkdir does not create intermediate directories,
* so the parent folder is attempted first. The paths are quoted so that a
* build path containing spaces still works. capture ignores the error raised
* when a folder already exists.
cap mkdir "$buildpath/output"
cap mkdir "$buildpath/output/deflator"

* CREATES GENERAL GDP DEFLATOR *************************************************
* Reads GDPDEF.csv, averages gdpdef within calendar year, and rescales the
* series so that its 2007 value equals 1.
import delimited "$buildpath/input/deflator/GDPDEF.csv", encoding(ISO-8859-1) clear

* the year is taken from the first four characters of the date string
gen year = real(substr(date,1,4))
collapse (mean) gdpdef, by(year)

* rescale by the 2007 value; stop if 2007 is absent, because dividing by a
* missing r(mean) would silently turn the whole series into missing values
summ gdpdef if year == 2007
if r(N) == 0 {
	display as error "GDPDEF has no nonmissing observation for base year 2007"
	exit 459
}
replace gdpdef = gdpdef/r(mean)
label var gdpdef "GDP Deflator in 2007 $"
save "$buildpath/output/deflator/deflator_year.dta", replace


* CREATES INDUSTRY-SPECIFIC DEFLATOR *******************************************
* Builds a yearly price index by NAICS code from the BLS PPI file, scaled so
* that each industry's 2007 value equals 1.
import delimited "$buildpath/input/deflator/ppi_bls.txt", delimiters(tab) clear

* removing subproducts: strip blanks from the series id, then keep ids that are
* 15 characters long, or 16 characters long and ending in the letter P
replace series_id = subinstr(series_id, " ", "", .)
keep if strlen(series_id) == 15 | (strlen(series_id) == 16 & substr(series_id, -1, 1) == "P")

* generating naics6 variable: characters 4 to 9 of the id, dashes removed,
* converted to a number; ids that do not convert are dropped
gen naics = subinstr(substr(series_id, 4, 6), "-", "", .)
destring naics, force replace ignore("X" "R" "M" "T" "K")
keep if naics != .

* collapsing at the yearly level, then dividing by the industry's 2007 value
* (industries without a 2007 value end up with a missing index)
collapse (mean) value, by(naics year)
bysort naics: egen base_2007 = mean(cond(year == 2007, value, .))
replace value = value/base_2007
drop base_2007

* saving file
rename value ppidef
label var ppidef "BLS PPI Deflator in 2007 Dollars"
save "$buildpath/output/deflator/deflator_industry_year.dta", replace

* compare to NBER deflator and fill if missing
* NBER shipments price index (piship), 1997 onward, rescaled to 2007 = 1
use "$buildpath/input/shipments/NBER_shipments_naics1997_1958-2011.dta", clear
keep naics year piship
drop if year < 1997
bysort naics: egen base_2007 = mean(cond(year == 2007, piship, .))
replace piship = piship/base_2007
drop base_2007

* attach the BLS index; rows that come only from the BLS file and predate
* 1997 are removed after the merge
merge 1:1 naics year using "$buildpath/output/deflator/deflator_industry_year.dta", nogenerate
keep if year >= 1997
corr piship ppidef
sort naics year

* where the BLS index is missing but the NBER one is not, use NBER and flag it
gen flag = 1 if piship != . & ppidef == .
label var flag "Index missing at BLS, obtained from NBER CES"
replace ppidef = piship if flag==1

* fill in missing years inside each industry's year range and linearly
* interpolate ppidef over them
* note: ipolate fills every interior gap, not only gaps of a single year
tsset naics year
tsfill
bysort naics (year): ipolate ppidef year, gen(ppidef_ipol)

keep naics year ppidef* flag
save "$buildpath/output/deflator/deflator_industry_year.dta", replace
